一 . 概述

使用朴素贝叶斯算法实现垃圾邮件分类。

二 . 数据集

使用 Kaggle 提供的垃圾邮件数据集（文件名为 spam_ham_dataset.csv，代码中用到其中的 text 列和 label 列）。

三 . 代码

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer # td-if
from sklearn.model_selection import train_test_split # 用于切割数据
from sklearn.naive_bayes import MultinomialNB # 朴素贝叶斯模型

# Spam / ham e-mail classifier: bag-of-words counts -> TF-IDF -> multinomial
# naive Bayes, evaluated on a held-out 20% split.

data = pd.read_csv('spam_ham_dataset.csv')  # load the dataset; must provide 'text' and 'label' columns

x = data[['text']]  # features: the raw message text (one-column DataFrame)
y = data[['label']]  # target: the class label (one-column DataFrame)

# Hold out 20% of the rows for testing (test:train = 1:4). A fixed seed keeps
# the split, and therefore the reported accuracy, reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

print('train data number: {d}'.format(d=len(x_train)))
print('test data number: {d}'.format(d=len(x_test)))

# Learn the vocabulary from the TRAINING text only. Fitting on the full
# dataset would leak information about the test messages into the features.
cv = CountVectorizer()
count = cv.fit_transform(x_train['text'])

# Re-weight the raw term counts with TF-IDF statistics, also fitted on the
# training split only.
tfidf = TfidfTransformer()
tfidf_matrix = tfidf.fit_transform(count)

# Pass the label column as a 1-D Series: scikit-learn expects a 1-D target
# and warns when handed an (n, 1) DataFrame.
model = MultinomialNB()
model.fit(tfidf_matrix, y_train['label'])

# Vectorise the test text once with the already-fitted transformers and reuse
# the matrix for both prediction and scoring.
test_matrix = tfidf.transform(cv.transform(x_test['text']))

predictions = model.predict(test_matrix)
accuracy = model.score(test_matrix, y_test['label'])  # mean accuracy on the test split

# Report the result explicitly; a bare expression prints nothing when the file
# is run as a script rather than in a notebook cell.
print('test accuracy: {a:.4f}'.format(a=accuracy))